{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "4f3CKqFUqL2-"
   },
   "source": [
    "# Synthetic Features and Outliers"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "jnKgkN5fHbGy"
   },
   "source": [
    "**Learning Objectives:**\n",
    "  * Create a synthetic feature that is the ratio of two other features\n",
    "  * Use this new feature as an input to a linear regression model\n",
    "  * Improve the effectiveness of the model by identifying and clipping (capping) outliers in the input data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "S8gm6BpqRRuh"
   },
   "source": [
    "## Setup"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Install the TensorFlow 2.0 pre-release (`2.0.0-beta1`) that this notebook is pinned to"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Install the pinned TensorFlow 2.0.0-beta1 pre-release.\n",
    "!pip install tensorflow==2.0.0-beta1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from matplotlib import cm\n",
    "from matplotlib import gridspec\n",
    "from matplotlib import pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import logging\n",
    "from packaging import version\n",
    "from IPython.display import display"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Notebook-wide display and logging configuration.\n",
    "pd.set_option('display.max_rows', 10)  # show at most 10 rows per frame\n",
    "pd.set_option('display.float_format', '{:.1f}'.format)  # one decimal place\n",
    "logging.getLogger('tensorflow').disabled = True  # silence the 'tensorflow' logger"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 470
    },
    "colab_type": "code",
    "id": "9D8GgUovHbG0",
    "outputId": "e93d9eed-bea7-4247-891b-8074bad9df1e"
   },
   "outputs": [],
   "source": [
    "import tensorflow as tf\n",
    "\n",
    "%load_ext tensorboard\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datetime import datetime\n",
    "import io\n",
    "\n",
    "# Silence the 'tensorboard' logger.\n",
    "logging.getLogger('tensorboard').disabled = True\n",
    "\n",
    "# Load the California housing training data from the hosted CSV.\n",
    "california_housing_dataframe = pd.read_csv(\"https://download.mlcc.google.com/mledu-datasets/california_housing_train.csv\", sep=\",\")\n",
    "\n",
    "# Shuffle the rows. No random seed is set in this cell, so the order can\n",
    "# differ between runs.\n",
    "california_housing_dataframe = california_housing_dataframe.reindex(\n",
    "    np.random.permutation(california_housing_dataframe.index))\n",
    "# Scale the target down by a factor of 1000 (presumably dollars to thousands\n",
    "# of dollars; confirm against the dataset description).\n",
    "california_housing_dataframe[\"median_house_value\"] /= 1000.0\n",
    "# Display the shuffled, rescaled frame as the cell output.\n",
    "california_housing_dataframe"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "I6kNgrwCO_ms"
   },
   "source": [
    "We'll set up our `plot_to_image` function to convert the matplotlib plot specified by `figure` into a PNG image tensor that can be logged to TensorBoard."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "sjBzqxL_okLq"
   },
   "outputs": [],
   "source": [
    "def plot_to_image(figure):\n",
    "  \"\"\"Renders a matplotlib figure into a batched PNG image tensor.\n",
    "\n",
    "  The supplied figure is closed after rendering, so it is not displayed\n",
    "  inline in the notebook and cannot be drawn on afterwards.\n",
    "\n",
    "  Args:\n",
    "    figure: The matplotlib figure to render.\n",
    "\n",
    "  Returns:\n",
    "    The PNG decoded with 4 channels by `tf.image.decode_png`, with a leading\n",
    "    batch dimension added by `tf.expand_dims`.\n",
    "  \"\"\"\n",
    "  # Render into an in-memory buffer. Saving through the figure object (rather\n",
    "  # than `plt.savefig`) guarantees that the supplied figure is the one written,\n",
    "  # even if another figure has since become pyplot's current figure.\n",
    "  buf = io.BytesIO()\n",
    "  figure.savefig(buf, format='png')\n",
    "  # Closing the figure prevents it from being displayed directly inside\n",
    "  # the notebook.\n",
    "  plt.close(figure)\n",
    "  # Convert the PNG bytes to a TF image; getvalue() returns the whole buffer\n",
    "  # regardless of the current stream position.\n",
    "  image = tf.image.decode_png(buf.getvalue(), channels=4)\n",
    "  # Add the batch dimension\n",
    "  return tf.expand_dims(image, 0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Next, we'll define the function for model training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "VgQPftrpHbG3"
   },
   "outputs": [],
   "source": [
    "def fit_model(learning_rate,\n",
    "              steps_per_epoch,\n",
    "              batch_size,\n",
    "              input_feature,\n",
    "              epochs=10):\n",
    "  \"\"\"Trains a linear regression model of one feature.\n",
    "\n",
    "  After every epoch the training RMSE is printed and a plot of the learned\n",
    "  line over a random sample of the data is written as an image summary.\n",
    "\n",
    "  Args:\n",
    "    learning_rate: A `float`, the learning rate.\n",
    "    steps_per_epoch: A non-zero `int`, the number of training steps per epoch.\n",
    "      A training step consists of a forward and backward pass using a single\n",
    "      batch.\n",
    "    batch_size: A non-zero `int`, the batch size.\n",
    "    input_feature: A `string` specifying a column from `california_housing_dataframe`\n",
    "      to use as input feature.\n",
    "    epochs: A non-zero `int`, the number of epochs to train for. Defaults to 10.\n",
    "  Returns:\n",
    "    A Pandas `DataFrame` with a \"predictions\" column and a \"targets\" column,\n",
    "    computed over the full dataset after training the model.\n",
    "  \"\"\"\n",
    "\n",
    "  features = california_housing_dataframe[[input_feature]].values\n",
    "  label = \"median_house_value\"\n",
    "  labels = california_housing_dataframe[label].values\n",
    "\n",
    "  model = tf.keras.models.Sequential([\n",
    "      tf.keras.layers.Dense(1, activation='linear', kernel_initializer='zeros')\n",
    "  ])\n",
    "  model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=learning_rate, clipnorm=5.0),\n",
    "                loss='mse',\n",
    "                metrics=[tf.keras.metrics.RootMeanSquaredError()])\n",
    "\n",
    "  # The per-epoch plots draw this random sample of 300 rows.\n",
    "  sample = california_housing_dataframe.sample(n=300)\n",
    "  # Take the timestamp once so both log directories of a run share it.\n",
    "  timestamp = datetime.now().strftime(\"%Y%m%d-%H%M%S\")\n",
    "  logdir = \"logs/synthetic_features_and_outliers/plots\" + timestamp\n",
    "  scalars_logdir = \"logs/synthetic_features_and_outliers/scalars\" + timestamp\n",
    "  file_writer = tf.summary.create_file_writer(logdir)\n",
    "\n",
    "  # Set up to plot the state of our model's line each epoch.\n",
    "  # Colormaps take positions in [0, 1]; spreading the epochs over that whole\n",
    "  # range gives every epoch's line its own color.\n",
    "  colors = [cm.coolwarm(x) for x in np.linspace(0, 1, epochs)]\n",
    "  x_min_max = (sample[input_feature].min(), sample[input_feature].max())\n",
    "  y_min_max = (0, sample[label].max())\n",
    "\n",
    "  def create_figure():\n",
    "    \"\"\"Builds a fresh scatter plot of the sampled feature against the label.\"\"\"\n",
    "    figure = plt.figure(figsize=(15, 6))\n",
    "    plt.title(\"Learned Line by Epoch\")\n",
    "    plt.ylabel(label)\n",
    "    plt.xlabel(input_feature)\n",
    "    plt.scatter(sample[input_feature], sample[label])\n",
    "    return figure\n",
    "\n",
    "  def log(epoch, logs):\n",
    "    \"\"\"Prints the epoch's RMSE and logs a plot of the current learned line.\"\"\"\n",
    "    root_mean_squared_error = logs[\"root_mean_squared_error\"]\n",
    "    print(\"  epoch %02d : %0.2f\" % (epoch, root_mean_squared_error))\n",
    "\n",
    "    weight, bias = [x.flatten()[0] for x in model.layers[0].get_weights()]\n",
    "\n",
    "    # Apply some math to ensure that the data and line are plotted neatly.\n",
    "    if weight == 0:\n",
    "      # A flat line never crosses the y extents; span the sampled x range\n",
    "      # instead of dividing by zero.\n",
    "      x_extents = np.array(x_min_max, dtype=float)\n",
    "    else:\n",
    "      # Find where the line crosses the y extents, then clamp those points\n",
    "      # to the sampled x range.\n",
    "      x_extents = (np.array(y_min_max) - bias) / weight\n",
    "      x_extents = np.maximum(np.minimum(x_extents,\n",
    "                                        x_min_max[1]),\n",
    "                             x_min_max[0])\n",
    "    y_extents = weight * x_extents + bias\n",
    "    figure = create_figure()\n",
    "    plt.plot(x_extents, y_extents, color=colors[epoch])\n",
    "    with file_writer.as_default():\n",
    "      tf.summary.image(\"Learned Line by Epoch\",\n",
    "                       plot_to_image(figure),\n",
    "                       step=epoch)\n",
    "\n",
    "  model_callback = tf.keras.callbacks.LambdaCallback(\n",
    "      on_epoch_end=lambda epoch, logs: log(epoch, logs))\n",
    "  tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=scalars_logdir,\n",
    "                                                       update_freq='epoch')\n",
    "\n",
    "  print(\"Train model...\")\n",
    "  print(\"RMSE (on training data):\")\n",
    "  history = model.fit(features,\n",
    "            labels,\n",
    "            epochs=epochs,\n",
    "            steps_per_epoch=steps_per_epoch,\n",
    "            batch_size=batch_size,\n",
    "            callbacks=[model_callback, tensorboard_callback],\n",
    "            verbose=0).history\n",
    "  # Write out any image summaries the writer is still buffering.\n",
    "  file_writer.flush()\n",
    "  print(\"Model training finished.\")\n",
    "\n",
    "  calibration_data = pd.DataFrame()\n",
    "  # np.asarray makes `.flatten()` available whether the prediction comes back\n",
    "  # as a NumPy array or as an eager tensor.\n",
    "  calibration_data[\"predictions\"] = np.asarray(model.predict_on_batch(features)).flatten()\n",
    "  calibration_data[\"targets\"] = pd.Series(labels)\n",
    "  display(calibration_data.describe())\n",
    "  # Report the last epoch's metric instead of relying on a fixed index.\n",
    "  root_mean_squared_error = history[\"root_mean_squared_error\"][-1]\n",
    "  print(\"Final RMSE (on training data): %0.2f\" % root_mean_squared_error)\n",
    "\n",
    "  return calibration_data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "FJ6xUNVRm-do"
   },
   "source": [
    "## Task 1: Try a Synthetic Feature\n",
    "\n",
    "Both the `total_rooms` and `population` features count totals for a given city block.\n",
    "\n",
    "But what if one city block were more densely populated than another? We can explore how block density relates to median house value by creating a synthetic feature that's a ratio of `total_rooms` and `population`.\n",
    "\n",
    "In the cell below, create a feature called `rooms_per_person`, and use that as the `input_feature` to `fit_model()`.\n",
    "\n",
    "What's the best performance you can get with this single feature by tweaking the learning rate? (The better the performance, the better your regression line should fit the data, and the lower\n",
    "the final RMSE should be.)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "suyBB7DZnkGj"
   },
   "outputs": [],
   "source": [
    "# Delete this notebook's log directory, if any, before logging new runs.\n",
    "!rm -rf logs/synthetic_features_and_outliers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "both",
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 535
    },
    "colab_type": "code",
    "id": "5ihcVutnnu1D",
    "outputId": "72603dda-15bd-4aa8-d6d8-e7c1f8cd0ec1"
   },
   "outputs": [],
   "source": [
    "# Synthetic feature: ratio of total rooms to population for each row.\n",
    "california_housing_dataframe[\"rooms_per_person\"] = (\n",
    "    california_housing_dataframe[\"total_rooms\"] / california_housing_dataframe[\"population\"])\n",
    "\n",
    "# Train on the new feature; `calibration_data` holds the predictions and\n",
    "# targets that `fit_model` returns.\n",
    "calibration_data = fit_model(\n",
    "    learning_rate=0.00005,\n",
    "    steps_per_epoch=500,\n",
    "    batch_size=5,\n",
    "    input_feature=\"rooms_per_person\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): `google.datalab` is presumably only installed on Datalab-style\n",
    "# runtimes; confirm the target environment before relying on this import.\n",
    "from google.datalab.ml import TensorBoard\n",
    "# Start TensorBoard over the directory that holds this notebook's logs.\n",
    "TensorBoard().start('logs/synthetic_features_and_outliers')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "ZjQrZ8mcHFiU"
   },
   "source": [
    "## Task 2: Identify Outliers\n",
    "\n",
    "We can visualize the performance of our model by creating a scatter plot of predictions vs. target values.  Ideally, these would lie on a perfectly correlated diagonal line.\n",
    "\n",
    "Use Pyplot's [`scatter()`](https://matplotlib.org/gallery/shapes_and_collections/scatter.html) to create a scatter plot of predictions vs. targets, using the rooms-per-person model you trained in Task 1.\n",
    "\n",
    "Do you see any oddities?  Trace these back to the source data by looking at the distribution of values in `rooms_per_person`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "s0tiX2gdRe-S"
   },
   "outputs": [],
   "source": [
    "# `logdir` is reused by a later cell to create another timestamped writer.\n",
    "logdir = \"logs/synthetic_features_and_outliers/plots\"\n",
    "file_writer = tf.summary.create_file_writer(logdir + datetime.now().strftime(\"%Y%m%d-%H%M%S\"))\n",
    "\n",
    "figure = plt.figure(figsize=(15, 6))\n",
    "# Draw in the left cell of a 1x2 subplot grid; the right cell is not used\n",
    "# in this figure.\n",
    "plt.subplot(1, 2, 1)\n",
    "plt.scatter(calibration_data[\"predictions\"], calibration_data[\"targets\"])\n",
    "# `plot_to_image` closes the figure, so the plot is logged as an image\n",
    "# summary rather than shown inline.\n",
    "with file_writer.as_default():\n",
    "  tf.summary.image(\"Predictions vs Targets\",\n",
    "                   plot_to_image(figure),\n",
    "                   step=0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "kMQD0Uq3RqTX"
   },
   "source": [
    "The calibration data shows most scatter points aligned to a line. The line is almost vertical, but we'll come back to that later. Right now let's focus on the ones that deviate from the line. We notice that they are relatively few in number.\n",
    "\n",
    "If we plot a histogram of `rooms_per_person`, we find that we have a few outliers in our input data:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "POTM8C_ER1Oc"
   },
   "outputs": [],
   "source": [
    "figure = plt.figure()\n",
    "# Select the right cell of a 1x2 subplot grid; the left cell stays empty\n",
    "# in this figure.\n",
    "plt.subplot(1, 2, 2)\n",
    "_ = california_housing_dataframe[\"rooms_per_person\"].hist()\n",
    "# Logs through the `file_writer` created in the previous code cell.\n",
    "with file_writer.as_default():\n",
    "  tf.summary.image(\"Rooms per person\",\n",
    "                   plot_to_image(figure),\n",
    "                   step=0)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Relies on the `TensorBoard` class imported from `google.datalab.ml` in an earlier cell.\n",
    "TensorBoard().start('logs/synthetic_features_and_outliers')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "9l0KYpBQu8ed"
   },
   "source": [
    "## Task 3: Clip Outliers\n",
    "\n",
    "See if you can further improve the model fit by setting the outlier values of `rooms_per_person` to some reasonable minimum or maximum.\n",
    "\n",
    "For reference, here's a quick example of how to apply a function to a Pandas `Series`:\n",
    "\n",
    "    clipped_feature = my_dataframe[\"my_feature_name\"].apply(lambda x: max(x, 0))\n",
    "\n",
    "The above `clipped_feature` will have no values less than `0`."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "8YGNjXPaSMPV"
   },
   "source": [
    "The histogram we created in Task 2 shows that the majority of values are less than `5`. Let's clip `rooms_per_person` to 5, and plot a histogram to double-check the results."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "_ypglcLvULjS"
   },
   "outputs": [],
   "source": [
    "# Cap `rooms_per_person` at 5; values at or below 5 are left unchanged.\n",
    "california_housing_dataframe[\"rooms_per_person\"] = (\n",
    "    california_housing_dataframe[\"rooms_per_person\"]).apply(lambda x: min(x, 5))\n",
    "figure = plt.figure()\n",
    "_ = california_housing_dataframe[\"rooms_per_person\"].hist()\n",
    "# Log the histogram of the clipped feature through the existing `file_writer`.\n",
    "with file_writer.as_default():\n",
    "  tf.summary.image(\"Clipped Rooms per person\",\n",
    "                   plot_to_image(figure),\n",
    "                   step=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Relies on the `TensorBoard` class imported from `google.datalab.ml` in an earlier cell.\n",
    "TensorBoard().start('logs/synthetic_features_and_outliers')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "vO0e1p_aSgKA"
   },
   "source": [
    "To verify that clipping worked, let's train again and print the calibration data once more:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 535
    },
    "colab_type": "code",
    "id": "ZgSP2HKfSoOH",
    "outputId": "59a2a2fb-2775-46c5-8f32-455652a7954b"
   },
   "outputs": [],
   "source": [
    "# Retrain on the clipped feature, this time with learning_rate=0.05 and\n",
    "# 1000 steps per epoch (the Task 1 run used 0.00005 and 500).\n",
    "calibration_data = fit_model(\n",
    "    learning_rate=0.05,\n",
    "    steps_per_epoch=1000,\n",
    "    batch_size=5,\n",
    "    input_feature=\"rooms_per_person\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "gySE-UgfSony"
   },
   "outputs": [],
   "source": [
    "# New timestamped writer; `logdir` is the notebook-level path defined in\n",
    "# the Task 2 plotting cell.\n",
    "file_writer = tf.summary.create_file_writer(logdir + datetime.now().strftime(\"%Y%m%d-%H%M%S\"))\n",
    "figure = plt.figure()\n",
    "_ = plt.scatter(calibration_data[\"predictions\"], calibration_data[\"targets\"])\n",
    "# Log the scatter plot of the retrained model's predictions against the targets.\n",
    "with file_writer.as_default():\n",
    "  tf.summary.image(\"Predictions vs Targets\",\n",
    "                   plot_to_image(figure),\n",
    "                   step=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Relies on the `TensorBoard` class imported from `google.datalab.ml` in an earlier cell.\n",
    "TensorBoard().start('logs/synthetic_features_and_outliers')"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "collapsed_sections": [
    "JndnmDMp66FL",
    "i5Ul3zf5QYvW",
    "WvgxW0bUSC-c"
   ],
   "name": "synthetic_features_and_outliers_with_tf2_and_keras_plus_tensorboard.ipynb",
   "provenance": [],
   "version": "0.3.2"
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
